library(tidyverse) # coding with tidy style ## ── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate) # processing time##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(igraph) # plotting network##
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Avoid silently converting character columns to factors (pre-R-4.0 default).
options(stringsAsFactors = FALSE)

# User-level metadata from Twitter's information-operations release
# (account identifiers are hashed). read_csv parses every column as
# character except follower/following counts (double) and
# account_creation_date (Date).
users_1 <- read_csv("data/china_082019_1_users_csv_hashed.csv")
# Tweet-level data. Left to guess column types from the first rows,
# read_csv mis-types several id columns (retweet_userid/retweet_tweetid as
# logical, userid as double, user_profile_url as logical) and reports
# ~951,646 parsing failures. Force the id/url columns to character — Twitter
# ids overflow doubles and many of them are hashed strings — and keep only
# the genuinely numeric/date/logical columns typed.
tweets_1 <- read_csv(
  "data/china_082019_1_tweets_csv_hashed.csv",
  col_types = cols(
    .default = col_character(),
    follower_count = col_double(),
    following_count = col_double(),
    account_creation_date = col_date(format = ""),
    tweet_time = col_datetime(format = ""),
    is_retweet = col_logical(),
    quote_count = col_double(),
    reply_count = col_double(),
    like_count = col_double(),
    retweet_count = col_double()
  )
)
# Second batch of the release — not loaded here.
# users_2 <- read_csv("data/china_082019_2_users_csv_hashed.csv")
# tweets_2 <- read_csv("data/china_082019_2_tweets_csv_hashed.csv")

# Quick structure check: 1,906,831 tweets x 31 variables.
tweets_1 %>% glimpse()
# Filtering RT relations (366,253 out of 1,906,831 tweets).
rt_1 <- tweets_1 %>%
  select(user_screen_name, tweet_text, tweet_time) %>%
  filter(str_detect(tweet_text, "RT @"))

# For each tweet, extract the retweet target when number-of-RT == 1.
# "RT @name" — dropping the first 4 characters leaves the screen name.
rt_edge_1 <- rt_1 %>%
  mutate(rt_count = str_count(tweet_text, "RT @\\w+")) %>%
  filter(rt_count == 1) %>%
  mutate(to = str_extract(tweet_text, "RT @\\w+")) %>%
  mutate(to = str_sub(to, 5, nchar(to))) %>%
  select(from = user_screen_name, to, tweet_time)
# For each tweet, extract ALL retweet targets when number-of-RT > 1
# (nested RT chains), then stack with the single-RT edges.
rt_edge <- rt_1 %>%
  mutate(rt_count = str_count(tweet_text, "RT @\\w+")) %>%
  filter(rt_count > 1) %>%
  mutate(to = str_extract_all(tweet_text, "RT @\\w+")) %>%
  unnest(to) %>%
  mutate(to = str_sub(to, 5, nchar(to))) %>%
  select(from = user_screen_name, to, tweet_time) %>%
  bind_rows(rt_edge_1)

# Keep only edges whose target is also in the released user list,
# i.e. retweets among the suspended accounts themselves.
rt_edge_inner <- rt_edge %>%
  filter(to %in% users_1$user_screen_name)

library(igraph) # already attached above; re-attach is harmless
# Build a directed retweet graph: one edge per (from, to, year) triple,
# weighted by the number of retweets in that year.
g <- rt_edge_inner %>%
  mutate(year = year(tweet_time)) %>%
  # filter(year >= 2019) %>%
  count(from, to, year) %>%
  select(from, to, year, weight = n) %>%
  graph.data.frame(directed = TRUE)

# Inspect edge attributes: counts (weight) and year of the tie.
E(g)$weight
E(g)$year %>% head()
# V(g)
# V(g)$name

# Detect communities by information flow (Infomap).
V(g)$comm <- membership(infomap.community(g))
# V(g)$comm
table(V(g)$comm)

# Degree centrality (all ties).
V(g)$degree <- degree(g, mode = "all")
# In-degree: interaction from out to in (being retweeted).
V(g)$indegree <- degree(g, mode = "in")
# Out-degree: interaction from in to out (retweeting others).
V(g)$outdegree <- degree(g, mode = "out")
# Closeness centrality: average shortest-path length to other nodes.
# igraph warns: not well-defined for disconnected graphs.
V(g)$closeness <- centralization.closeness(g)$res
# Betweenness centrality: bridging degree of vertices.
V(g)$betweenness <- centralization.betweenness(g)$res

# Collect node-level properties into a table.
# NOTE(review): column name "betweeness" is a typo but kept for
# compatibility with any downstream use of nodes_all.
nodes_all <- data.frame(
  name = V(g)$name,
  degree = V(g)$degree,
  indegree = V(g)$indegree,
  outdegree = V(g)$outdegree,
  closeness = V(g)$closeness,
  community = V(g)$comm,
  betweeness = V(g)$betweenness
)
# nodes_all %>% View

# Quick-look plots: raw network, simplified network, then the cumulative
# degree distribution.
plot(g)
plot(g, edge.arrow.size = .4, vertex.label = NA)
plot(degree.distribution(g, cumulative = TRUE),
     pch = 20, xlab = "degree", ylab = "cumulative frequency")

# Force-directed layout for the polished plot.
l <- layout.fruchterman.reingold(g)
# l <- layout_with_kk(g)

# Polished plot: nodes colored by community and sized by degree;
# edges colored/labelled by year, width scaled by weight.
plot(g,
  layout = l,
  # vertex.color = rgb(1, 1, 0, 0.2),
  vertex.color = factor(V(g)$comm),
  vertex.size = sqrt(V(g)$degree) * 3,
  vertex.frame.color = rgb(0, 0, 0, 0.5),
  vertex.label = str_sub(V(g)$name, 1, 10),
  vertex.label.cex = 0.6,
  vertex.label.color = rgb(0, 0, 0, 0.5),
  edge.curved = 0.1,
  edge.arrow.size = 0.1,
  edge.width = sqrt(E(g)$weight + 1),
  edge.color = E(g)$year,
  edge.label = E(g)$year,
  edge.label.cex = 0.4,
  edge.label.color = E(g)$year
)

# Compare retweeting before/after 2019-06-05.
# NOTE(review): hk_tweets is not defined in this chunk — presumably a
# Hong-Kong-related subset of tweets_1 built upstream; confirm its source.
hk_rt_compare <- hk_tweets %>%
  mutate(date0605 = if_else(tweet_time > as.Date("2019-06-05"),
                            "after", "before")) %>%
  filter(str_detect(tweet_text, "RT @")) %>%
  select(user_screen_name, tweet_text, date0605)
# How many retweets fall before vs. after the cut date?
hk_rt_compare %>% count(date0605)

# Build the retweet edgelist of HK-related tweets; one row per RT target.
hk_rt_edgelist <- hk_rt_compare %>%
  mutate(rt = str_extract_all(tweet_text, "RT @\\w+")) %>%
  unnest(rt) %>%
  mutate(to = str_sub(rt, 5, nchar(rt))) %>%
  # filter(to %in% users_1$user_screen_name) %>%
  select(from = user_screen_name, to, date0605)

# Build a directed graph from the edgelist, weighted by retweet count
# within each before/after period.
g_hk <- hk_rt_edgelist %>%
  count(from, to, date0605) %>%
  select(from, to, date0605, weight = n) %>%
  graph.data.frame(directed = TRUE)
# Examine network properties: communities (Infomap).
V(g_hk)$comm <- membership(infomap.community(g_hk))
table(V(g_hk)$comm)

# Examine ego-network properties.
V(g_hk)$degree <- degree(g_hk, mode = "all")
V(g_hk)$indegree <- degree(g_hk, mode = "in")
V(g_hk)$outdegree <- degree(g_hk, mode = "out")
# igraph warns: closeness is not well-defined for disconnected graphs.
V(g_hk)$closeness <- centralization.closeness(g_hk)$res
V(g_hk)$betweenness <- centralization.betweenness(g_hk)$res
# Color flag: translucent orange when the account appears in the released
# (suspended) user list, translucent grey otherwise.
V(g_hk)$deleted <- if_else(V(g_hk)$name %in% users_1$user_screen_name,
                           rgb(1, 0.25, 0, 0.3), rgb(0, 0, 0, 0.1))

# Save network properties into a table. data_frame() is deprecated in
# tibble — use tibble() instead.
# NOTE(review): column name "betweeness" is a typo but kept for
# compatibility with any downstream use of nodes_hk_rt.
nodes_hk_rt <- tibble(
  name = V(g_hk)$name,
  degree = V(g_hk)$degree,
  indegree = V(g_hk)$indegree,
  outdegree = V(g_hk)$outdegree,
  closeness = V(g_hk)$closeness,
  community = V(g_hk)$comm,
  betweeness = V(g_hk)$betweenness,
  deleted = V(g_hk)$deleted
)
# Quick-look plots: raw network, simplified network, then the cumulative
# degree distribution.
plot(g_hk)
plot(g_hk, edge.arrow.size = .4, vertex.label = NA)
plot(degree.distribution(g_hk, cumulative = TRUE),
     pch = 20, xlab = "degree", ylab = "cumulative frequency")

# Better network plotting. layout.fruchterman.reingold.grid() was removed
# from igraph (it warned and fell back to Fruchterman-Reingold), so call
# the plain Fruchterman-Reingold layout directly.
l <- layout.fruchterman.reingold(g_hk)
plot(g_hk,
  layout = l,
  vertex.color = V(g_hk)$deleted,
  vertex.size = sqrt(V(g_hk)$degree) * 2,
  vertex.frame.color = rgb(0, 0, 0, 0.5),
  vertex.label = str_sub(V(g_hk)$name, 1, 10),
  vertex.label.cex = 0.6,
  vertex.label.color = rgb(0, 0, 0, 0.5),
  edge.curved = .1,
  edge.arrow.size = .1,
  edge.width = E(g_hk)$weight / 2,
  edge.color = factor(E(g_hk)$date0605)
)